{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "文章链接"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "https://mp.weixin.qq.com/s?__biz=MzI0NTQ1NDc4Nw==&mid=2247485377&idx=1&sn=f815be5e0f78af78c9fe79b4cb71f879&chksm=e94f056dde388c7b8e4d7466ec96ea71aac5a003d15c4235a33ebb0c354a2c4032a0c2af6009&scene=21#wechat_redirect"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n",
    "import logging\n",
    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "文件夹\"C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\" 将被用来存储语料和临时性的字典\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import tempfile\n",
    "TEMP_FOLDER = tempfile.gettempdir()\n",
    "print('文件夹\"{}\" 将被用来存储语料和临时性的字典'.format(TEMP_FOLDER))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 一、从字符串到向量（From Strings to Vectors）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n",
    "from gensim import corpora\n",
    "import jieba"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "2020-11-10 19:29:28,158 : DEBUG : Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\jieba.cache\n",
      "2020-11-10 19:29:28,166 : DEBUG : Loading model from cache C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 1.678 seconds.\n",
      "2020-11-10 19:29:29,843 : DEBUG : Loading model cost 1.678 seconds.\n",
      "Prefix dict has been built successfully.\n",
      "2020-11-10 19:29:29,846 : DEBUG : Prefix dict has been built successfully.\n"
     ]
    }
   ],
   "source": [
    "jieba.add_word('知识图谱') #防止“知识图谱”被切错词\n",
    "\n",
    "docs = ['商业新知:知识图谱为内核,构建商业创新服务完整生态。',\n",
    "'如何更好利用知识图谱技术做反欺诈? 360金融首席数据科学家沈赟开讲。',\n",
    "'知识管理 | 基于知识图谱的国际知识管理领域可视化分析。',\n",
    "'一文详解达观数据知识图谱技术与应用。',\n",
    "'知识图谱技术落地金融行业的关键四步。',\n",
    "'一文读懂知识图谱的商业应用进程及技术背景。',\n",
    "'海云数据CPO王斌:打造大数据可视分析与AI应用的高科技企业。',\n",
    "'智能产业|《人工智能标准化白皮书2018》带来创新创业新技术标准。',\n",
    "'国家语委重大科研项目“中华经典诗词知识图谱构建技术研究”开题。',\n",
    "'最全知识图谱介绍:关键技术、开放数据集、应用案例汇总。',\n",
    "'中译语通Jove Mind知识图谱平台 引领企业智能化发展。',\n",
    "'知识图谱:知识图谱赋能企业数字化转型，为企业升级转型注入新能量。']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Write the raw headlines to mycorpus.txt, one document per line, for the streaming cells below.\n",
    "# Mode 'w' (not 'a') keeps this cell idempotent: re-running it no longer appends a second copy\n",
    "# of every document. The explicit UTF-8 encoding matches the encoding the later cells read with,\n",
    "# instead of relying on the platform default.\n",
    "with open('mycorpus.txt', 'w', encoding='utf-8') as file_handle:\n",
    "    for doc in docs:\n",
    "        # The trailing newline terminates the document so each one stays on its own line.\n",
    "        file_handle.write(str(doc) + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['商业 新知 : 知识图谱 为 内核 , 构建 商业 创新 服务 完整 生态 。',\n",
       " '如何 更好 利用 知识图谱 技术 做 反 欺诈 ?   360 金融 首席 数据 科学家 沈赟 开讲 。',\n",
       " '知识 管理   |   基于 知识图谱 的 国际 知识 管理 领域 可视化 分析 。',\n",
       " '一文 详解 达观 数据 知识图谱 技术 与 应用 。',\n",
       " '知识图谱 技术 落地 金融 行业 的 关键 四步 。',\n",
       " '一文 读懂 知识图谱 的 商业 应用 进程 及 技术 背景 。',\n",
       " '海云 数据 CPO 王斌 : 打造 大 数据 可视 分析 与 AI 应用 的 高科技 企业 。',\n",
       " '智能 产业 | 《 人工智能 标准化 白皮书 2018 》 带来 创新 创业 新 技术标准 。',\n",
       " '国家语委 重大 科研项目 “ 中华 经典 诗词 知识图谱 构建 技术 研究 ” 开题 。',\n",
       " '最全 知识图谱 介绍 : 关键技术 、 开放 数据 集 、 应用 案例 汇总 。',\n",
       " '中译 语通 Jove   Mind 知识图谱 平台   引领 企业 智能化 发展 。',\n",
       " '知识图谱 : 知识图谱 赋能 企业 数字化 转型 ， 为 企业 升级 转型 注入 新 能量 。']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "documents = [' '.join(jieba.lcut(i)) for i in docs]\n",
    "documents "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[['商业', '知识图谱', '构建', '商业', '创新'],\n",
      " ['知识图谱', '技术', '金融', '数据'],\n",
      " ['知识', '管理', '知识图谱', '知识', '管理', '分析'],\n",
      " ['一文', '数据', '知识图谱', '技术'],\n",
      " ['知识图谱', '技术', '金融'],\n",
      " ['一文', '知识图谱', '商业', '技术'],\n",
      " ['数据', '数据', '分析', '企业'],\n",
      " ['创新', '新'],\n",
      " ['知识图谱', '构建', '技术'],\n",
      " ['知识图谱', '数据'],\n",
      " ['知识图谱', '企业'],\n",
      " ['知识图谱', '知识图谱', '企业', '转型', '企业', '转型', '新']]\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# 移除常用词以及分词\n",
    "stoplist = [i.strip() for i in open('chineseStopWords.txt',encoding='utf-8').readlines()]\n",
    "texts = [[word for word in document.lower().split() if word not in stoplist]\n",
    "for document in documents]\n",
    "\n",
    "\n",
    "# 移除仅出现一次的词汇\n",
    "from collections import defaultdict\n",
    "frequency = defaultdict(int)\n",
    "for text in texts:\n",
    "    for token in text:\n",
    "            frequency[token] += 1\n",
    "\n",
    "\n",
    "texts = [[token for token in text if frequency[token] > 1] for text in texts]\n",
    "\n",
    "\n",
    "from pprint import pprint  #使打印的格式更齐整\n",
    "pprint(texts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2020-11-10 19:30:08,433 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n",
      "2020-11-10 19:30:08,436 : INFO : built Dictionary(14 unique tokens: ['创新', '商业', '构建', '知识图谱', '技术']...) from 12 documents (total 46 corpus positions)\n",
      "2020-11-10 19:30:08,441 : INFO : saving Dictionary object under C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\deerwester.dict, separately None\n",
      "2020-11-10 19:30:08,445 : INFO : saved C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\deerwester.dict\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dictionary(14 unique tokens: ['创新', '商业', '构建', '知识图谱', '技术']...)\n"
     ]
    }
   ],
   "source": [
    "dictionary = corpora.Dictionary(texts)\n",
    "dictionary.save(os.path.join(TEMP_FOLDER, 'deerwester.dict'))  # 保存字典，以备后续查找之用\n",
    "print(dictionary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'创新': 0, '商业': 1, '构建': 2, '知识图谱': 3, '技术': 4, '数据': 5, '金融': 6, '分析': 7, '知识': 8, '管理': 9, '一文': 10, '企业': 11, '新': 12, '转型': 13}\n"
     ]
    }
   ],
   "source": [
    "\n",
    "print(dictionary.token2id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(3, 1), (11, 1), (13, 1)]\n"
     ]
    }
   ],
   "source": [
    "\n",
    "new_doc = \"知识图谱 为 企业 转型 助力\"\n",
    "new_vec = dictionary.doc2bow(new_doc.lower().split())\n",
    "print(new_vec)  # “为”、“助力”等词汇未出现在字典中，因而被忽略"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2020-11-10 19:30:34,236 : INFO : storing corpus in Matrix Market format to C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\deerwester.mm\n",
      "2020-11-10 19:30:34,241 : INFO : saving sparse matrix to C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\deerwester.mm\n",
      "2020-11-10 19:30:34,243 : INFO : PROGRESS: saving document #0\n",
      "2020-11-10 19:30:34,247 : INFO : saved 12x14 matrix, density=23.214% (39/168)\n",
      "2020-11-10 19:30:34,251 : INFO : saving MmCorpus index to C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\deerwester.mm.index\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 1), (1, 2), (2, 1), (3, 1)]\n",
      "[(3, 1), (4, 1), (5, 1), (6, 1)]\n",
      "[(3, 1), (7, 1), (8, 2), (9, 2)]\n",
      "[(3, 1), (4, 1), (5, 1), (10, 1)]\n",
      "[(3, 1), (4, 1), (6, 1)]\n",
      "[(1, 1), (3, 1), (4, 1), (10, 1)]\n",
      "[(5, 2), (7, 1), (11, 1)]\n",
      "[(0, 1), (12, 1)]\n",
      "[(2, 1), (3, 1), (4, 1)]\n",
      "[(3, 1), (5, 1)]\n",
      "[(3, 1), (11, 1)]\n",
      "[(3, 2), (11, 2), (12, 1), (13, 2)]\n"
     ]
    }
   ],
   "source": [
    "\n",
    "corpus = [dictionary.doc2bow(text) for text in texts]\n",
    "corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'deerwester.mm'), corpus)  #保存到本地，以作后用\n",
    "for c in corpus:\n",
    "    print(c)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 二、语料库流（Corpus Streaming） --- 每次仅调用一个文档"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "from smart_open import smart_open\n",
    "class MyCorpus(object):\n",
    "    \"\"\"Stream a text corpus from disk, yielding one bag-of-words vector per line.\n",
    "\n",
    "    Each line of the file is treated as one document: it is segmented with jieba and\n",
    "    converted with the notebook-level ``dictionary``, which is looked up at iteration\n",
    "    time (so re-binding ``dictionary`` later changes what this corpus yields).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    path : str, optional\n",
    "        UTF-8 text file with one document per line. Defaults to 'mycorpus.txt',\n",
    "        so ``MyCorpus()`` behaves as before.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, path='mycorpus.txt'):\n",
    "        self.path = path\n",
    "\n",
    "    def __iter__(self):\n",
    "        # Built-in open() is enough for a local text file and, unlike the deprecated\n",
    "        # smart_open.smart_open() call, the with-block closes the handle when iteration\n",
    "        # finishes or the generator is closed.\n",
    "        with open(self.path, 'r', encoding='utf-8') as corpus_file:\n",
    "            for line in corpus_file:\n",
    "                yield dictionary.doc2bow(' '.join(jieba.lcut(line)).lower().split())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<__main__.MyCorpus object at 0x00000270678EDBE0>\n"
     ]
    }
   ],
   "source": [
    "corpus_memory_friendly = MyCorpus()  #不需要将语料载入到内存中!\n",
    "print(corpus_memory_friendly)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 2), (1, 3), (2, 2), (3, 11), (4, 5), (5, 5), (6, 2), (7, 2), (8, 2), (9, 2), (10, 2), (11, 4), (12, 2), (13, 2)]\n"
     ]
    }
   ],
   "source": [
    "\n",
    "for vector in corpus_memory_friendly:  #每次载入一个文档向量\n",
    "    print(vector)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2020-11-10 19:37:33,450 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n",
      "2020-11-10 19:37:33,459 : INFO : built Dictionary(105 unique tokens: [',', ':', '。', '为', '内核']...) from 12 documents (total 161 corpus positions)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dictionary(11 unique tokens: ['创新', '商业', '构建', '知识图谱', '技术']...)\n"
     ]
    }
   ],
   "source": [
    "\n",
    "from six import iteritems\n",
    "from smart_open import smart_open\n",
    "#收集所有词汇的统计信息\n",
    "dictionary = corpora.Dictionary(' '.join(jieba.lcut(line)).lower().split() for line in smart_open('mycorpus.txt', 'r',encoding='utf-8'))\n",
    "\n",
    "#停用词和低频词（这里指仅出现1次的词汇）的ID集合\n",
    "stop_ids = [dictionary.token2id[stopword] for stopword in stoplist \n",
    "if stopword in dictionary.token2id]\n",
    "once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]\n",
    "\n",
    "#真正实施去停用词和低频次的操作\n",
    "dictionary.filter_tokens(stop_ids + once_ids)\n",
    "print(dictionary)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 三、语料格式（Corpus Formats）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2020-11-10 19:37:53,685 : INFO : storing corpus in Matrix Market format to C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\corpus.mm\n",
      "2020-11-10 19:37:53,689 : INFO : saving sparse matrix to C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\corpus.mm\n",
      "2020-11-10 19:37:53,691 : INFO : PROGRESS: saving document #0\n",
      "2020-11-10 19:37:53,694 : INFO : saved 2x2 matrix, density=25.000% (1/4)\n",
      "2020-11-10 19:37:53,697 : INFO : saving MmCorpus index to C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\corpus.mm.index\n"
     ]
    }
   ],
   "source": [
    "\n",
    "#创建一个包含2个文档的微小语料，以一个python列表呈现\n",
    "\n",
    "corpus = [[(1, 0.5)], []]  # 其中一个文档故意搞成空的\n",
    "\n",
    "corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'corpus.mm'), corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2020-11-10 19:38:02,971 : INFO : converting corpus to SVMlight format: C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\corpus.svmlight\n",
      "2020-11-10 19:38:02,979 : INFO : saving SvmLightCorpus index to C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\corpus.svmlight.index\n",
      "2020-11-10 19:38:02,983 : INFO : no word id mapping provided; initializing from corpus\n",
      "2020-11-10 19:38:02,985 : INFO : storing corpus in Blei's LDA-C format into C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\corpus.lda-c\n",
      "2020-11-10 19:38:02,989 : INFO : saving vocabulary of 2 words to C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\corpus.lda-c.vocab\n",
      "2020-11-10 19:38:02,993 : INFO : saving BleiCorpus index to C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\corpus.lda-c.index\n",
      "2020-11-10 19:38:02,996 : INFO : no word id mapping provided; initializing from corpus\n",
      "2020-11-10 19:38:02,998 : INFO : storing corpus in List-Of-Words format into C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\corpus.low\n",
      "2020-11-10 19:38:03,004 : WARNING : List-of-words format can only save vectors with integer elements; 1 float entries were truncated to integer value\n",
      "2020-11-10 19:38:03,007 : INFO : saving LowCorpus index to C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\corpus.low.index\n"
     ]
    }
   ],
   "source": [
    "\n",
    "corpora.SvmLightCorpus.serialize(os.path.join(TEMP_FOLDER, 'corpus.svmlight'), corpus)\n",
    "corpora.BleiCorpus.serialize(os.path.join(TEMP_FOLDER, 'corpus.lda-c'), corpus)\n",
    "corpora.LowCorpus.serialize(os.path.join(TEMP_FOLDER, 'corpus.low'), corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2020-11-10 19:38:09,285 : INFO : loaded corpus index from C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\corpus.mm.index\n",
      "2020-11-10 19:38:09,288 : INFO : initializing cython corpus reader from C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\corpus.mm\n",
      "2020-11-10 19:38:09,292 : INFO : accepted corpus with 2 documents, 2 features, 1 non-zero entries\n"
     ]
    }
   ],
   "source": [
    "corpus = corpora.MmCorpus(os.path.join(TEMP_FOLDER, 'corpus.mm'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MmCorpus(2 documents, 2 features, 1 non-zero entries)\n"
     ]
    }
   ],
   "source": [
    "\n",
    "print(corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[(1, 0.5)], []]\n"
     ]
    }
   ],
   "source": [
    "# 一种打印语料库的方式是 --- 将其整个载入内存中\n",
    "print(list(corpus))  #  调用 list() 能将任何序列转化为普通的Python list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[(1, 0.5)], []]\n"
     ]
    }
   ],
   "source": [
    "# 一种打印语料库的方式是 --- 将其整个载入内存中\n",
    "print(list(corpus))  #  调用 list() 能将任何序列转化为普通的Python list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(1, 0.5)]\n",
      "[]\n"
     ]
    }
   ],
   "source": [
    "\n",
    "# 另一种方法：一次打印一个文档\n",
    "for doc in corpus:\n",
    "    print(doc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2020-11-10 19:38:51,377 : INFO : no word id mapping provided; initializing from corpus\n",
      "2020-11-10 19:38:51,382 : INFO : storing corpus in Blei's LDA-C format into C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\corpus.lda-c\n",
      "2020-11-10 19:38:51,386 : INFO : saving vocabulary of 2 words to C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\corpus.lda-c.vocab\n",
      "2020-11-10 19:38:51,389 : INFO : saving BleiCorpus index to C:\\Users\\ADMINI~1\\AppData\\Local\\Temp\\corpus.lda-c.index\n"
     ]
    }
   ],
   "source": [
    "\n",
    "corpora.BleiCorpus.serialize(os.path.join(TEMP_FOLDER, 'corpus.lda-c'), corpus)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 四、与NumPy、SciPy的兼容性"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n",
    "import gensim\n",
    "import numpy as np\n",
    "numpy_matrix = np.random.randint(10, size=[5,2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[2, 6],\n",
       "       [1, 0],\n",
       "       [8, 2],\n",
       "       [2, 2],\n",
       "       [0, 8]])"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "numpy_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Dense2Corpus treats every column of the matrix as one document, so the number of terms\n",
    "# is the number of rows. Passing that row count (rather than the unrelated constant 10,\n",
    "# which padded the result with all-zero rows) makes corpus2dense return a matrix with\n",
    "# the same shape as numpy_matrix.\n",
    "corpus = gensim.matutils.Dense2Corpus(numpy_matrix)\n",
    "numpy_matrix_dense = gensim.matutils.corpus2dense(corpus, num_terms=numpy_matrix.shape[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[2., 6.],\n",
       "       [1., 0.],\n",
       "       [8., 2.],\n",
       "       [2., 2.],\n",
       "       [0., 8.]], dtype=float32)"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "numpy_matrix_dense"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n",
    "import scipy.sparse\n",
    "scipy_sparse_matrix = scipy.sparse.random(5,2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<5x2 sparse matrix of type '<class 'numpy.float64'>'\n",
       "\twith 0 stored elements in COOrdinate format>"
      ]
     },
     "execution_count": 42,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "scipy_sparse_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n",
    "corpus = gensim.matutils.Sparse2Corpus(scipy_sparse_matrix)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<gensim.matutils.Sparse2Corpus at 0x2706795e7f0>"
      ]
     },
     "execution_count": 44,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "corpus"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "scipy_csc_matrix = gensim.matutils.corpus2csc(corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<0x2 sparse matrix of type '<class 'numpy.float64'>'\n",
       "\twith 0 stored elements in Compressed Sparse Column format>"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "scipy_csc_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
